#' ---
#' title: "Agenda Seeding: County Demographic Data Prep"
#' date: "`r Sys.Date()`"
#' output: html_document
#' header-includes:
#'  - \usepackage{booktabs}
#'  - \usepackage{longtable}
#'  - \usepackage{array}
#'  - \usepackage{multirow}
#'  - \usepackage{wrapfig}
#'  - \usepackage{float}
#'  - \usepackage{colortbl}
#'  - \usepackage{pdflscape}
#'  - \usepackage{tabu}
#'  - \usepackage{threeparttable}
#'  - \usepackage{threeparttablex}
#'  - \usepackage[normalem]{ulem}
#'  - \usepackage{makecell}
#'  - \usepackage{dcolumn}
#'  - \usepackage{setspace}\doublespacing
#' ---


## ---- demo_spin_code, eval = FALSE, include = FALSE ---- 
# spin code to output Rmd / Rnw
# set output_format to "html_document" for html
# rmarkdown::render(input = here::here("code", "demographic_data_prep3.R"), output_format = "html_document", clean = TRUE)


## ---- demo_setup1, include = FALSE ----
library(knitr)
library(here) # checked read., read_, load(, save(, source(



## ---- demo_load_packages, include = FALSE ----

# data loading
library(here)
library(foreign)
library(haven)

# scrubbing
library(plyr)
library(reshape2)
library(stringr)
library(janitor)
library(dataMaid)

# modeling
library(plm)
library(stargazer)

# imputation / interpolation / missingness
library(naniar)
library(simputation)
library(imputeTS)
library(Amelia)

# package management
library(conflicted)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
conflict_prefer("mutate", "dplyr")
conflict_prefer("rename", "dplyr")
conflict_prefer("here",   "here")
conflict_prefer("arrange", "dplyr")



set.seed(12345)



## ---- demo_load_custom_functions, include = FALSE ----

# read.dct(): parse the `_column(...)` lines of a Stata dictionary (.dct) file.
#
# Args:
#   dct:             path (or connection) accepted by readLines().
#   labels.included: "yes" to also extract the trailing column label,
#                    "no" (default) to extract positions, types, names and
#                    widths only.
# Returns:
#   A named list of equal-length vectors: StartPos (numeric), Str (character),
#   ColName (character, made syntactically valid), ColWidth (numeric) and,
#   when labels.included = "yes", ColLabel (character).
read.dct <- function(dct, labels.included = "no") {
    # Fail early with a clear message on anything other than "yes"/"no".
    labels.included <- match.arg(labels.included, c("no", "yes"))

    dict_lines <- readLines(dct)
    dict_lines <- dict_lines[grepl("_column", dict_lines)]

    # One capture group per field; `fields` maps field name -> target type.
    if (labels.included == "yes") {
        pattern <- "_column\\(([0-9]+)\\)\\s+([a-z0-9]+)\\s+(.*)\\s+%([0-9]+)[a-z]\\s+(.*)"
        fields <- c(StartPos = "numeric", Str = "character",
                    ColName = "character", ColWidth = "numeric",
                    ColLabel = "character")
    } else {
        pattern <- "_column\\(([0-9]+)\\)\\s+([a-z0-9]+)\\s+(.*)\\s+%([0-9]+).*"
        fields <- c(StartPos = "numeric", Str = "character",
                    ColName = "character", ColWidth = "numeric")
    }

    metadata <- lapply(seq_along(fields), function(i) {
        # Keep only capture group i, then trim whitespace and strip quotes.
        out <- gsub(pattern, paste0("\\", i), dict_lines)
        out <- gsub("^\\s+|\\s+$", "", out)
        out <- gsub('\"', "", out, fixed = TRUE)
        if (fields[[i]] == "numeric") as.numeric(out) else out
    })
    names(metadata) <- names(fields)

    # Column names: drop embedded whitespace and make them valid R names.
    metadata[["ColName"]] <- make.names(gsub("\\s", "", metadata[["ColName"]]))

    metadata
}

# read.dat.dct(): read a Stata data file and name its columns from a
# dictionary (.dct) file.
#
# Args:
#   dat:             path to the Stata file, passed to read.dta().
#   dct:             path to the matching dictionary, passed to read.dct().
#   labels.included: forwarded to read.dct() ("yes" or "no").
# Returns:
#   The data frame from read.dta() with its columns renamed to the
#   dictionary's ColName entries.
#
# The dictionary parsing is delegated to read.dct() rather than duplicated
# here. An earlier fixed-width path (read.fwf() driven by ColWidth, with
# ColLabel stored as an attribute) is no longer used; the data are read with
# read.dta() instead.
read.dat.dct <- function(dat, dct, labels.included = "no") {
    metadata <- read.dct(dct, labels.included = labels.included)

    myDF <- read.dta(dat)

    # A dictionary with fewer entries than columns would otherwise leave
    # trailing columns with NA names without any message.
    if (length(metadata[["ColName"]]) != ncol(myDF)) {
        stop("dictionary describes ", length(metadata[["ColName"]]),
             " columns but the data file has ", ncol(myDF),
             call. = FALSE)
    }

    names(myDF) <- metadata[["ColName"]]
    myDF
}

# name.clean(): shorten descriptive column labels into snake_case names.
#
# Args:
#   x: vector of labels (it is passed to tolower() first).
# Returns:
#   The rewritten labels. The last statement of the body is an assignment, so
#   the value comes back invisibly: assign it or print() it to see it.
#
# The rules run strictly top to bottom and every rule sees the output of the
# ones before it, so their order matters. str_replace() rewrites only the
# first match in each string; str_replace_all() rewrites every match.
name.clean <- function(x) {
    x <- tolower(x)
    
    # Phrase substitutions and punctuation removal (parentheses, "$", ",", "&"
    # are deleted everywhere; " of " collapses to a single space).
    x <- str_replace(x, "per 1,000 population", "rate")
    x <- str_replace(x, "civilian labor force aged ", "")
    x <- str_replace_all(x, " of ", " ")
    x <- str_replace_all(x, "\\(", "")
    x <- str_replace_all(x, "\\)", "")
    x <- str_replace_all(x, "\\$", "")
    x <- str_replace_all(x, "\\,", "")
    x <- str_replace_all(x, "\\&", "")
    # drop one trailing space
    x <- str_replace(x, " $", "")
    
    # Abbreviations applied while words are still separated by spaces.
    x <- str_replace(x, "%", "per")
    x <- str_replace(x, " w/ ", " ")
    x <- str_replace(x, "population", "pop")
    x <- str_replace(x, "total", "tot")
    x <- str_replace(x, "crude ", "")
    x <- str_replace(x, "local gov'ts", "locgov")
    x <- str_replace(x, "negro", "black")
    # Every comma has already been removed above, so this pattern cannot match
    # at this point.
    x <- str_replace(x, ", 19", "")
    # "16+" must be handled before the generic "+" -> "plus" rule below.
    x <- str_replace(x, "16\\+", " ")
    x <- str_replace(x, "\\+", "plus")
    x <- str_replace_all(x, "\\=", "eq")
    
    # A trailing " 19NN" loses its space and century, leaving "NN" attached to
    # the preceding word.
    x <- str_replace_all(x, " 19([0-9]+)$", "\\1")
    # Collapse doubled spaces (two passes), then switch to underscores.
    x <- str_replace_all(x, "  ", " ")
    x <- str_replace_all(x, "  ", " ")
    x <- str_replace_all(x, " ", "_")
    x <- str_replace(x, "_$", "")
    x <- str_replace_all(x, "\\/", "_")
    x <- str_replace(x, ":_", "_")
    # Abbreviations that are matched on the underscore form.
    x <- str_replace(x, "employed", "emp")
    x <- str_replace(x, "agriculture", "agr")
    x <- str_replace(x, "mineral_industries", "minind")
    x <- str_replace(x, "selected_services", "selserv")
    x <- str_replace(x, "hospital", "hosp")
    x <- str_replace(x, "retail_trade", "retail")
    x <- str_replace(x, "wholesale_trade", "whole")    
    x <- str_replace(x, "manufacturing", "manuf")
    x <- str_replace(x, "annual", "ann")
    #    x <- str_replace(x, "local_govt_finances", "locgov")
    x <- str_replace(x, "civilian", "civ")
    x <- str_replace(x, "employees", "emp")
    x <- str_replace(x, "average", "avg")
    x <- str_replace(x, "median", "med")
    x <- str_replace(x, "occupied_housing_units", "housing")
    x <- str_replace(x, "year-round_housing_units", "housing")
}


# fiver(): convert codes to character and left-pad them with "0" to a width
# of five characters (strings already five or more characters long are left
# unchanged by str_pad()).
fiver <- function(x) {
    codes <- as.character(x)
    str_pad(string = codes, width = 5, side = "left", pad = "0")
}

# any_na(): TRUE when `x` contains at least one missing value, else FALSE.
# Uses the built-in anyNA(), which can stop at the first NA instead of
# building the full is.na() vector.
any_na <- function(x) {
    anyNA(x)
}


## ---- load_census62_data, include = FALSE ----
# Source: ICPSR 02896, "Historical, Demographic, Economic, and Social Data:
# The United States, 1790-2002"; this chunk reads data set DS0074.

# Stata file, located through here()
county62 <- read.dta(
    here("data/ICPSR_02896/ICPSR_02896_0074_1962/DS0074/02896-0074-Data.dta")
)

dim(county62)

# Descriptive variable names: tab-separated file read as two character
# columns, `varnum` and `name`
countynames62 <- read.table(
    file       = here("data/ICPSR_02896/census1962variablenames3.txt"),
    sep        = "\t",
    col.names  = c("varnum", "name"),
    colClasses = c("character", "character")
)

## ---- clean_census62_data, include = FALSE ----

# Rows are dropped with logical masks rather than x[-which(cond), ]: a
# negative index built from an empty which() result selects zero rows, i.e.
# it would silently discard the whole table when nothing matches.

# extract and drop state-level rows and 1 nation-level row (county == 0)
is_state_level   <- county62$county %in% 0
state_level_rows <- which(is_state_level)
length(state_level_rows)

state62  <- county62[is_state_level, ]

county62 <- county62[!is_state_level, ]

dim(county62)

# clean four digit fips codes (left-pad to five characters)
nchar(county62$fips) %>% table()
county62$fips <- fiver(county62$fips)

# drop AK (state FIPS 2) and HI (state FIPS 15)
ak_hi <- which(county62$statefip == 2 | county62$statefip == 15)
county62 <- county62[!(county62$statefip %in% c(2, 15)), ]

# drop Yellowstone National Park (56047)
# Source note 16: Yellowstone National Park (Part), Wyoming (56047) was
# dropped as an entity. The county was dropped from AHRF, and all non zero
# fields were weighted by population and split between Park County, Wyoming
# (56029), and Teton County, Wyoming (56039).
yellow <- which(county62$fips == "56047")
county62 <- county62[!(county62$fips %in% "56047"), ]

dim(county62) # 3104
head(names(county62), 20)


## For the variables that begin with "var" replace using descriptive countynames
## County cols 1-9 are keys and geographic info
var_cols <- grep("^var", names(county62))
length(var_cols)
# A length mismatch would otherwise recycle or truncate the names silently.
stopifnot(length(var_cols) == nrow(countynames62))
names(county62)[var_cols] <- countynames62$name
county62 <- county62 %>% clean_names()

# correct miscoded per_pop_for_born: values above 100 are divided by 100
over100 <- which(county62$per_pop_for_born > 100)

county62$per_pop_for_born[over100] <- county62$per_pop_for_born[over100] / 100


# keep the analysis columns ...
county62s <- county62[, c(
    "fips", "statefip", "counfip", "tot_pop", "per_urban", "per_rural_farm",
    "per_nonwhite", "med_age", "per_pop_growth", "per_pop_for_born",
    "tot_livebirths", "tot_marriages", "tot_families", "med_income",
    "per_income_3000", "per_income_10000plus", "med_school_25plus",
    "per_less4_school", "per_12plus_school", "per_migrants", "per_unemp",
    "tot_housing_units", "owner_housing", "locgov_tot_exp"
)]

# ... and rename them positionally with a "60" suffix. Note that per_black60
# is filled from per_nonwhite and per_hsplus60 from per_12plus_school.
names(county62s) <- c(
    "fips", "fips_st", "fips_cty", "tot_pop60", "per_urban60",
    "per_rural_farm60", "per_black60", "med_age60", "per_pop_growth60",
    "per_pop_foreign60", "tot_livebirths60", "tot_marriages60",
    "tot_families60", "med_income60", "per_income_300060",
    "per_income_10000plus60", "med_school60", "per_less4_school60",
    "per_hsplus60", "per_migrants60", "per_unemp60", "tot_housing_units60",
    "owner_housing60", "locgov_tot_exp60"
)

# derived rates
county62s$birth_rate60 <- (county62s$tot_livebirths60 * 1000) / county62s$tot_pop60
summary(county62s$birth_rate60)

county62s$locgov_exp_pc60 <- (county62s$locgov_tot_exp60 * 1000) / county62s$tot_pop60
summary(county62s$locgov_exp_pc60)

county62s$per_owner_housing60 <- (county62s$owner_housing60 / county62s$tot_housing_units60) * 100

# correct miscoded per_hsplus60: values above 100 are divided by 100
over100 <- which(county62s$per_hsplus60 > 100)

county62s$per_hsplus60[over100] <- county62s$per_hsplus60[over100] / 100


summary(county62s$per_owner_housing60)

summary(county62s$per_black60)

summary(county62s$per_unemp60)

summary(county62s$per_hsplus60)

summary(county62s$per_pop_growth60)


## ---- impute_census62_missingness, include = FALSE ----

# explore missingness
gg_miss_var(county62s)

# which counties are missing locgov_tot_exp
miss <- which(is.na(county62s$locgov_tot_exp60) )

## assign locgov for manhattan to other four boros
# county62s has no county-name column (it holds only the columns selected
# when it was built), so a lookup on `county62s$name` returns NULL and the
# assignment never happens. The boroughs are therefore located by FIPS code.
nyc_boro_fips <- c("36005", "36047", "36081", "36085") # Bronx, Kings, Queens, Richmond
missny <- miss[county62s$fips[miss] %in% nyc_boro_fips]
ny     <- which(county62s$fips == "36061")              # New York County (Manhattan)

if (length(missny) > 0 && length(ny) == 1) {
    county62s$locgov_tot_exp60[missny] <- county62s$locgov_tot_exp60[ny]
} else if (length(missny) > 0) {
    warning("New York County (36061) not found exactly once; ",
            "borough locgov_tot_exp60 left missing", call. = FALSE)
}
# NOTE(review): locgov_exp_pc60 was derived from locgov_tot_exp60 before this
# chunk and is not recomputed here, so the per-capita column does not pick up
# the values assigned above. Confirm whether it should.



## ---- merged_counties_recode, include = FALSE, eval = TRUE ----

# Recode the row with FIPS 32025 (Ormsby) to FIPS 32510.
# NOTE(review): presumably 32510 is the successor entity's code in the later
# census files, so that the joins on fips line up -- confirm.
miss_ormsby <- which(county62s$fips == "32025")
county62s[miss_ormsby,] 

county62s[miss_ormsby, c("fips", "fips_st", "fips_cty")] <-
    data.frame(
        fips = "32510",
        fips_st = 32,
        fips_cty = 510,
        stringsAsFactors = FALSE
    )

######## VA

# 51129, "Norfolk County"
# 51785, "South Norfolk city"
# merge into
# 51550, "Chesapeake city"

old_ctys <- county62s %>% filter(fips == "51129" | fips == "51785")

# store order of columns
order_cols <- names(old_ctys)

# structure of data merge: id columns are replaced, count columns are summed,
# every remaining column is averaged with 1960 population as the weight
ids     <- c("fips", "fips_st", "fips_cty")
weights <- "tot_pop60"
sums    <- c("tot_pop60", "tot_livebirths60", "tot_marriages60", "tot_families60",
           "tot_housing_units60", "owner_housing60", "locgov_tot_exp60")
avgs    <- base::setdiff(names(old_ctys), c(ids, sums))

sum_cols <- old_ctys %>% 
    summarize_at(
      sums, .funs = function(x) sum(x, na.rm = TRUE)
    ) 

avg_cols <- old_ctys %>% 
    summarize_at(
      avgs, .funs = function(x) weighted.mean(x, w = old_ctys$tot_pop60)
    )

id_cols <- data.frame(fips = "51550", fips_st = 51, fips_cty = 550,
                      stringsAsFactors = FALSE)

# restore the original column order (indexing by name avoids passing an
# external character vector to select())
new_cty <- bind_cols(id_cols, sum_cols, avg_cols)[, order_cols]

# drop old counties first, then append the merged row
county62s <- county62s %>% filter(fips != "51129" & fips != "51785")
county62s <- bind_rows(county62s, new_cty)


######## VA

# 51151       ---       ---       ---     76124 Princess Anne County
# merged into the city of Virginia Beach
# 51810    393069    262199    172106      8091 Virginia Beach city

# https://en.wikipedia.org/wiki/Princess_Anne_County,_Virginia



old_ctys <- county62s %>% filter(fips == "51151" | fips == "51810")

sum_cols <- old_ctys %>% 
    summarize_at(
        sums, .funs = function(x) sum(x, na.rm = TRUE)
    ) 

avg_cols <- old_ctys %>% 
    summarize_at(
        avgs, .funs = function(x) weighted.mean(x, w = old_ctys$tot_pop60)
    )

id_cols <- data.frame(fips = "51810", fips_st = 51, fips_cty = 810,
                      stringsAsFactors = FALSE)

new_cty <- bind_cols(id_cols, sum_cols, avg_cols)[, order_cols]

# check rows, old-orig, new-sum/average, old-orig
bind_rows(old_ctys, new_cty)[c(1,3,2), ]

# The merged row keeps FIPS 51810, the same code as one of the rows being
# replaced. The old rows must therefore be removed BEFORE the merged row is
# appended; filtering afterwards would delete the merged row as well and
# Virginia Beach would vanish from the data.
county62s <- county62s %>% filter(fips != "51151" & fips != "51810")
county62s <- bind_rows(county62s, new_cty)



dim(county62s) # 3102





## ---- load_census72, include = FALSE ----


county72 <- read.dta(here("data/ICPSR_02896/ICPSR_02896_0076_1972/DS0076/02896-0076-Data.dta"))

# left-pad FIPS codes to five characters
county72$fips <- fiver(county72$fips)

# column labels come from the Stata dictionary shipped with the data
countynames72 <- read.dct(here("data/ICPSR_02896/ICPSR_02896_0076_1972/DS0076/02896-0076-Setup.dct"), labels.included = "yes")

countynames72 <- data.frame(countynames72)

# snakecase and shorten labels
names(county72) <- name.clean(countynames72$ColLabel)

# drop state-level and national rows
counties <- which(county72$icpsr_county_state_code != 0) 
length(counties)
county72 <- county72[counties, ]
dim(county72)

# drop AK and HI
# A logical mask is used instead of county72[-ak_hi, ]: with an empty ak_hi
# the negative index would select zero rows and empty the table.
ak_hi <- which(county72$fips_state_code == 2 | county72$fips_state_code == 15)
county72 <- county72[!(county72$fips_state_code %in% c(2, 15)), ]
dim(county72) # 3107


## ---- census72_missingness, include = FALSE ----

miss <- which(is.na(county72$locgov_per_capita_general_expenditure_exclu._capital_outlay_67) )

## assign locgov for manhattan to other four boros
# The boroughs are identified by FIPS code rather than by a name pattern:
# "KINGS" or "RICHMOND" also match counties in other states, which would then
# receive Manhattan's figure if they happened to be missing. Codes are
# compared as integers so that zero-padding and storage type do not matter.
fips72 <- as.integer(as.character(county72$state_county_fips_code))
missny <- miss[fips72[miss] %in% c(36005L, 36047L, 36081L, 36085L)] # Bronx, Kings, Queens, Richmond
ny     <- which(fips72 == 36061L)                                    # New York County (Manhattan)

if (length(missny) > 0 && length(ny) == 1) {
    county72$locgov_per_capita_general_expenditure_exclu._capital_outlay_67[missny] <-
        county72$locgov_per_capita_general_expenditure_exclu._capital_outlay_67[ny]
} else if (length(missny) > 0) {
    warning("New York County (36061) not found exactly once; ",
            "borough local-government expenditure left missing", call. = FALSE)
}




## ---- county72_subset, include = FALSE ----

# keep the analysis columns ...
county72s <- county72[, c(
    "state_county_fips_code", "fips_state_code", "fips_county_code",
    "tot_pop70", "per_pop_growth_1960-1970", "per_female70", "per_urban70",
    "white_pop70", "black_pop70", "per_change_in_black_pop_1960-1970",
    "med_age_years70", "per_pop_foreign_stock70",
    "per_persons_spanish_heritage70", "birth_rate_rate68",
    "med_years_schooling_complted_persons_25plus70",
    "per_persons_25plus_12_or_more_yrs_schooling70", "per_unemp70",
    "med_family_income_all_families69", "housing_per_owner-occupied70",
    "locgov_per_capita_general_expenditure_exclu._capital_outlay_67"
)]

# ... and rename them positionally
names(county72s) <- c(
    "fips", "fips_st", "fips_cty", "tot_pop70", "per_pop_growth70",
    "per_female70", "per_urban70", "white_pop70", "black_pop70",
    "per_change_in_black_pop70", "med_age70", "per_pop_foreign70",
    "per_spanish70", "birth_rate68", "med_school70", "per_hsplus70",
    "per_unemp70", "med_income69", "per_owner_housing70", "locgov_exp_pc67"
)
# "med_income_white69", "med_income_black69",
names(county72s)

summary(county72s$locgov_exp_pc67)

county72s$nonwhite_pop70 <- county72s$tot_pop70 - county72s$white_pop70
summary(county72s$nonwhite_pop70)

county72s$span_pop70 <- county72s$tot_pop70 * (county72s$per_spanish70 / 100) 
summary(county72s$span_pop70)

missing.span <- which(is.na(county72s$span_pop70))

# estimate used for per_black70: total population minus white population
county72s$black_est_pop70 <- county72s$tot_pop70 - county72s$white_pop70
summary(county72s$black_est_pop70)

# NOTE(review): the statement below never had any effect. `missing.span`
# holds integer positions from which(), so `!missing.span` is an all-FALSE
# logical vector and selects nothing. Taken literally it would replace the
# estimate with the Spanish-heritage population, which does not look
# intended. It is left disabled so that results are unchanged; decide whether
# a Spanish-heritage adjustment is wanted and, if so, what it should be.
# county72s$black_est_pop70[!missing.span] <- county72s$span_pop70[!missing.span]
summary(county72s$black_est_pop70)

county72s$per_black70 <- (county72s$black_est_pop70 / county72s$tot_pop70) * 100
summary(county72s$per_black70)

summary(county72s$per_hsplus70)

summary(county72s$per_unemp70)

summary(county72s$per_owner_housing70)

summary(county72s$per_pop_growth70)



## ---- load_census_county83, include = FALSE ----

county83 <- read.dta(here("data/ICPSR_02896/ICPSR_02896_0078_1983/DS0078/02896-0078-Data.dta"))

# left-pad FIPS codes to five characters
county83$fips <- fiver(county83$fips)

# column labels come from the Stata dictionary shipped with the data
countynames83 <- read.dct(here("data/ICPSR_02896/ICPSR_02896_0078_1983/DS0078/02896-0078-Setup.dct"), labels.included = "yes")

countynames83 <- data.frame(countynames83)

dim(county83)

names(county83) <- name.clean(countynames83$ColLabel)


# drop flag columns
# Logical masks are used for the drops in this chunk: a negative index built
# from an empty match (x[, -integer(0)]) would select nothing and silently
# empty the table.
flags <- grep("^flag", names(county83) )
head(flags)
county83 <- county83[, !grepl("^flag", names(county83)), drop = FALSE]

# drop non-county rows
names(county83)[1:10]
counties <- which(county83$countyeq1_stateeq2_usaeq3==1)
length(counties)

county83 <- county83[counties, ]
dim(county83)
#names(county83)

# drop AK and HI
ak_hi <- which(county83$fips_state_code == 2 | county83$fips_state_code == 15)
length(ak_hi)
county83 <- county83[!(county83$fips_state_code %in% c(2, 15)), ]

dim(county83) # 3109



## ---- census83_missingness, include = FALSE, eval = FALSE ----

# rows without a local-government per-capita expenditure figure
miss <- which(is.na(county83$direct_general_expenditure_local_government_per_capita_77) )


## assign median to all other NA locgov (all Alaska so doesn't really matter)
#missother <- setdiff(miss, c(missny) )
# na.rm is spelled TRUE: `T` is an ordinary variable that can be reassigned.
county83$direct_general_expenditure_local_government_per_capita_77[miss] <- median(county83$direct_general_expenditure_local_government_per_capita_77, na.rm = TRUE)



## ---- census83_subset, include = FALSE ----

# analysis columns, in the order the short names below are applied
county83s <- county83[, c(
    "state_county_fips_code", "fips_state_code", "fips_county_code",
    "tot_pop_april_170", "tot_pop_april_180", "urban_pop80", "white_pop80",
    "black_pop80", "american_indians_eskimos_aleuts80",
    "asian_pacific_islanders80", "spanish-origin_pop80", "female_pop80",
    "med_age80", "births_rate80", "marriages_rate80",
    "tot_housing_april_180.1", "owner-housing_april_180",
    "civ_labor_force_persons_aged_80",
    "unemp_persons_in_the_civ_labor_force_persons_aged_80",
    "med_family_money_income_79",
    "direct_general_expenditure_local_government_per_capita_77",
    "education_attainment_persons_aged_25plus_completing_4_years_hs_or_more80",
    "persons_aged_18_years_over80"
)]

# positional rename to short names
names(county83s) <- c(
    "fips", "fips_st", "fips_cty", "tot_pop70", "tot_pop80", "urban_pop80",
    "white_pop80", "black_pop80", "native_pop80", "asian_pop80",
    "spanish_pop80", "female_pop80", "med_age80", "birth_rate80",
    "marriages80", "tot_housing80", "owner_housing80", "labor_force80",
    "unemp80", "med_income79", "locgov_exp_pc77", "persons_12plus80",
    "eighteenplus_pop80"
)


# percentage measures derived from the counts
county83s$per_pop_growth80 <- 100 * (county83s$tot_pop80 - county83s$tot_pop70) / county83s$tot_pop70
summary(county83s$per_pop_growth80)

county83s$per_black80 <- 100 * county83s$black_pop80 / county83s$tot_pop80
summary(county83s$per_black80) 

county83s$per_urban80 <- 100 * county83s$urban_pop80 / county83s$tot_pop80
summary(county83s$per_urban80)

county83s$per_unemp80 <- 100 * county83s$unemp80 / county83s$labor_force80
summary(county83s$per_unemp80)


# NOTE(review): the numerator's source label refers to persons aged 25 plus,
# while the denominator's label refers to persons aged 18 and over -- confirm
# this is the intended base for the share.
county83s$per_hsplus80 <- 100 * county83s$persons_12plus80 / county83s$eighteenplus_pop80
summary(county83s$per_hsplus80)

county83s$per_owner_housing80 <- 100 * county83s$owner_housing80 / county83s$tot_housing80
summary(county83s$per_owner_housing80)

# tot_pop70 was only needed for the growth rate; drop it
county83s <- county83s[, names(county83s) != "tot_pop70"]


## ---- merge_three_census_sets, include = FALSE ----

# inspect the three subsets before joining
names(county62s)
dim(county62s)

names(county72s)
dim(county72s)

names(county83s)
dim(county83s)

# The 1962 subset is the base table; the later files are attached to it with
# left joins on the three FIPS keys, so only rows present in county62s remain.
county2 <- county62s %>% 
    left_join(county72s, by = c("fips", "fips_st", "fips_cty"))

dim(county2)
names(county2)

county2 <- county2 %>% 
    left_join(county83s, by = c("fips", "fips_st", "fips_cty"))



## ---- build_per_pop_foreign80, include = FALSE ----

county94 <- read.dta(here("data/ICPSR_02896/ICPSR_02896_0080_1994/DS0080/02896-0080-Data.dta"))

names(county94) %>% head()

county94$fips <- fiver(county94$fips)

# var026: Foreign born, population % of total population, 1990 (3c)
county94$var026 %>% summary()

county94 <- county94 %>% 
    rename(
        per_pop_foreign94 = var026
    )

cc3 <- left_join(county2, county94 %>% select(fips, per_pop_foreign94), by = "fips")
dim(cc3)

# Fill in the foreign-born share for 1980, which is not observed, from the
# neighbouring observations of each county. The series has four slots
# (per_pop_foreign60, per_pop_foreign70, the unknown 1980 value,
# per_pop_foreign94); slot 3 is always NA and is the value being imputed.
cc3$per_pop_foreign80 <- NA_real_

for (i in seq_len(nrow(cc3))) {
    time_series <- c(cc3$per_pop_foreign60[i], cc3$per_pop_foreign70[i], NA, cc3$per_pop_foreign94[i])
    ts_na <- sum(is.na(time_series))

    if (ts_na <= 2) {
        # at least two observed points: linear interpolation
        foreign80_imputed <- imputeTS::na_interpolation(time_series)
    } else if (ts_na == 3) {
        # a single observed point: simple mean, i.e. that value
        foreign80_imputed <- imputeTS::na_mean(time_series)
    } else {
        # Nothing observed. Without this branch the result of the previous
        # iteration would be reused, assigning another county's value here.
        foreign80_imputed <- rep(NA_real_, length(time_series))
    }

    cc3$per_pop_foreign80[i] <- foreign80_imputed[3]
}



# keep the keys and the analysis variables, drop the 1990 helper column, and
# order the columns alphabetically after the three keys
cc3 <- cc3[,grep("fips|tot_pop|per_black|birth_rate|med_age|locgov_exp_pc|per_owner_housing|per_hsplus|per_pop_foreign|med_income|per_unemp|per_urban|per_pop_growth", names(cc3)) ]
cc3 <- cc3 %>% select(-per_pop_foreign94)
cc3 <- cc3[, order(names(cc3) ) ]
cc3 <- cc3 %>% select(fips, fips_st, fips_cty, everything())

names(cc3)
dim(cc3)


# Align off-year suffixes with the census decade (67/68/69 -> 70,
# 77/79 -> 80). The patterns are anchored to the end of the name so that only
# the year suffix can ever be rewritten.
names(cc3) <- sub("(67|68|69)$", "70", names(cc3) )
names(cc3) <- sub("(77|79)$", "80", names(cc3) )

names(cc3)


# insert placeholder variables for future imputation
cc3$plag62 <- NA_real_
cc3$plag64 <- NA_real_
cc3$plag66 <- NA_real_
cc3$plag68 <- NA_real_
cc3$plag72 <- NA_real_
cc3$plag74 <- NA_real_
cc3$plag76 <- NA_real_
cc3$plag78 <- NA_real_


## ---- convert_to_time_series_cross_sectional, include = FALSE ----


# long format: one row per county and variableYEAR column
c4 <- melt(cc3, id.vars = c("fips", "fips_st", "fips_cty"))

# split variableYEAR into two columns at the boundary between the last
# letter and the first digit
c5 <- c4 %>% 
    separate(
        col  = variable,
        into = c("variable", "year"),
        sep  = "(?<=[[:alpha:]])(?=[[:digit:]])"
    )


# add 19 to 60/70/80 year
c5$year <- as.numeric(paste0("19", c5$year))


# transform to wide format data: one row per county-year, one column per
# variable
c6 <- c5 %>% 
    pivot_wider(
        id_cols     = c(fips, fips_st, fips_cty, year),
        names_from  = variable,
        values_from = value
    )


## ---- impute_missingness_with_imputeTS, include = FALSE ----

# counties with at least one missing foreign-born share
miss_fips <- c6 %>% 
    filter(is.na(per_pop_foreign)) %>% 
    pull(fips)


# peek at two of their rows before imputation
c6 %>% 
    filter(fips %in% miss_fips) %>% 
    arrange(fips, year) %>% 
    group_by(fips) %>% 
    head(2)


# impute using simple interpolation: within each county, ordered by year,
# every column that contains an NA is passed through na_interpolation()

c7 <- c6 %>% 
    select(-plag) %>%  # drop placeholder variable
    arrange(fips, year) %>% 
    group_by(fips) %>% 
    mutate_if(.predicate = any_na, .funs = na_interpolation) %>% 
    ungroup()

# the same two-row peek after imputation
c7 %>% 
    filter(fips %in% miss_fips) %>% 
    arrange(fips, year) %>% 
    group_by(fips) %>% 
    head(2)

# keep the three analysis years
c8 <- c7 %>% 
    filter(year %in% c(1964, 1968, 1972))


## ---- review_missingness ----

gg_miss_var(c8)


## ---- transformations_to_vars, include = FALSE ----

# Derived variables: log(x + 1) of local-government spending per capita and
# of total population, squared per_black, median income divided by 1000, and
# an indicator for state FIPS codes 1, 13, 22, 28 and 45.
c8 <- mutate(
    c8,
    locgov_exp_pc_log = log(locgov_exp_pc + 1),
    per_black_sq      = per_black ^ 2,
    tot_pop_log       = log(tot_pop + 1),
    med_income        = med_income / 1000,
    south2            = fips_st %in% c(1, 13, 22, 28, 45)
)


## ---- save_data ----

# Final table: state and county FIPS parts are stored as zero-padded strings
# (two and three characters wide).
county3 <- c8 %>% 
    mutate(
        fips_st  = str_pad(fips_st,  width = 2, side = "left", pad = "0"),
        fips_cty = str_pad(fips_cty, width = 3, side = "left", pad = "0")
    )

save(county3, file = here("data/county3_demo.Rdata"))


## ---- make_codebook, eval = FALSE ----

# Write a dataMaid codebook for the saved table; an existing file at the
# target path is replaced.
dataMaid::makeCodebook(
    data        = county3,
    reportTitle = "County Demographic Data, 1964-72",
    file        = here("codebooks/codebook_county_demographic_data.Rmd"),
    replace     = TRUE,
    checks      = list(character = NULL, factor = NULL)
)
